import altair as alt
import pandas as pd
from altair import datum

# hw7
# Homework 7
# Import Packages

# Turn off Altair's max-rows check on chart data for this session.
alt.data_transformers.disable_max_rows()
# Notebook output: DataTransformerRegistry.enable('default')

# Part 1
# Gasoline pump prices (US$ per liter): one row per country, one column per year.
gas_gap_data_url = "https://calvin-data304.netlify.app/data/pump_price_for_gasoline_us_per_liter.csv"
gas_gap_data = pd.read_csv(gas_gap_data_url)
gas_gap_data.head()
# Notebook output:
# | | country | 1991 | 1992 | 1993 | 1994 | 1995 | 1996 | 1997 | 1998 | 1999 | ... | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 |
# |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# | 0 | Afghanistan | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 1.05 | NaN | 1.15 | NaN | 1.28 | NaN | 1.07 | NaN | 0.7 |
# | 1 | Angola | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.38 | NaN | ... | NaN | 0.53 | NaN | 0.65 | NaN | 0.63 | NaN | 0.76 | NaN | 0.97 |
# | 2 | Albania | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.86 | NaN | ... | NaN | 1.36 | NaN | 1.46 | NaN | 1.81 | NaN | 1.76 | NaN | 1.36 |
# | 3 | Andorra | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 1.24 | NaN | 1.49 | NaN | 1.67 | NaN | 1.51 | NaN | NaN |
# | 4 | UAE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.23 | NaN | ... | NaN | 0.45 | NaN | 0.47 | NaN | 0.47 | NaN | 0.47 | NaN | 0.49 |
# 5 rows × 27 columns
# We account for several country-label discrepancies between the gas price data and the map data.
# Relabel individual rows of the gas price table so their `country` value
# matches the name the lookup join below compares against (`properties.name`).
# NOTE(review): the keys are positional row indices, so this assumes the CSV
# row order does not change — confirm before reusing with a refreshed file.
map_label_by_row = {
    35: "Dem. Rep. Congo",
    172: "United States of America",
    28: "Central African Rep.",
    58: "United Kingdom",
    152: "S. Sudan",
    85: "Kyrgyzstan",
    19: "Bosnia and Herz.",
    33: "Côte d'Ivoire",
    4: "United Arab Emirates",
    46: "Dominican Rep.",
    108: "Macedonia",
}
for row_index, map_label in map_label_by_row.items():
    gas_gap_data.at[row_index, 'country'] = map_label

# Country outlines from the world-atlas TopoJSON file.
countries = alt.topo_feature(
    'https://cdn.jsdelivr.net/npm/world-atlas@2/countries-110m.json',
    feature='countries',
)

# Base map: grey countries with black borders in a Mercator projection.
country_map = alt.Chart(countries).mark_geoshape(
    fill='#aaaaaa',
    stroke='#000000'
).project('mercator')
country_map.properties(width=600, height=400)

# Choropleth: join the 2012 price column onto each country shape by name and
# use it for the fill colour and the tooltip.
country_map.transform_lookup(
    lookup='properties.name',
    from_=alt.LookupData(gas_gap_data, 'country', ['2012'])
).encode(
    fill="2012:Q",
    tooltip=["properties.name:O", "2012:Q"]
).properties(width=600, height=400, title="Gas rate: US $ per liter")

# Part 2
democracy_url = "https://calvin-data304.netlify.app/data/wvs.csv"
democracy_data = pd.read_csv(democracy_url)

# Wrangle the total number of respondents for each nation.
# The counts column is named explicitly via `reset_index(name=...)` instead of
# renaming column `0`: newer pandas releases name the value_counts result
# "count", which made the old `rename(columns={0: "total"})` a silent no-op
# and left the chart without a `total` field.
respondants_per_country = (
    democracy_data["country"]
    .value_counts()
    .rename_axis("country")
    .reset_index(name="total")
)

# Bar chart of respondents per country, bars ordered by descending total.
base = alt.Chart(respondants_per_country).encode(
    alt.X(field='country', type="ordinal", sort="-y"),
    alt.Y(field='total', type="quantitative")
).properties(width=300, height=300, title="Number of Respondants")
base.mark_bar()

# Part 3
# Age3
# Box plots of age (whiskers spanning min to max) for each age3 category,
# one panel per country, three panels per row.
age3_facet = alt.Chart(democracy_data).mark_boxplot(extent="min-max").encode(
    alt.X(field="age", type="quantitative", title="Age in Years"),
    alt.Y(field="age3", type="nominal"),
    alt.Color("age3:N")
    # Disabled: alt.Tooltip(["min(age)", "max(age)"])
).properties(
    width=300, height=75
).facet(
    facet="country:O",
    columns=3
)
age3_facet

# Age6
# Box plots of age (whiskers spanning min to max) for each age6 category,
# one panel per country, three panels per row.
age6_facet = alt.Chart(democracy_data).mark_boxplot(extent="min-max").encode(
    alt.X(field="age", type="quantitative", title="Age in Years"),
    alt.Y(field="age6", type="nominal"),
    alt.Color("age6:N")
).properties(
    width=300, height=150
).facet(
    facet="country:O",
    columns=3
)
age6_facet

# Part 4
# Skipped. Will come back to this later if time allows.
# Part 5
# Line layer: mean democracy_importance for each age6 category.
# NOTE(review): sort="-x" on the x channel refers to the channel itself —
# confirm this yields the intended ordering of the age groups.
lines = alt.Chart(democracy_data).mark_line().encode(
    x=alt.X(field="age6", type="ordinal", sort="-x"),
    y=alt.Y("mean(democracy_importance):Q"),
).properties(
    width=200, height=400
)

# Band layer: error band that mark_errorband derives from the raw
# democracy_importance values per age6 category (default extent).
bands = alt.Chart(democracy_data).mark_errorband().encode(
    x=alt.X(field="age6", type="ordinal", sort="-x", title="Age Grouping"),
    y=alt.Y(field="democracy_importance", type="quantitative", title="Average importance of democracy"),
).properties(
    width=200, height=400,
    title=""
)

# Layer the two charts directly (no nested layer) and draw one panel per country.
alt.layer(lines, bands).facet(
    facet="country:O"
)

# Part 6
# Same layered chart as Part 5, but with the raw `age` field (treated as
# ordinal) on the x axis instead of the age6 categories.
# NOTE(review): the x title still reads "Age Grouping" although the field is
# the raw age — confirm whether "Age in Years" was intended.
lines = alt.Chart(democracy_data).mark_line().encode(
    x=alt.X(field="age", type="ordinal", sort="-x"),
    y=alt.Y("mean(democracy_importance):Q"),
).properties(
    width=200, height=400
)
bands = alt.Chart(democracy_data).mark_errorband().encode(
    x=alt.X(field="age", type="ordinal", sort="-x", title="Age Grouping"),
    y=alt.Y(field="democracy_importance", type="quantitative", title="Average importance of democracy"),
).properties(
    width=200, height=400,
    title=""
)

# Layer the two charts directly (no nested layer) and draw one panel per country.
alt.layer(lines, bands).facet(
    facet="country:O"
)

# Using “age” instead of “age6” makes the graphic worse, because “age” will
# calculate the statistics for each of the ages present in the dataset. On the
# other hand, “age6” is useful, because it bins each of the cases into 6 bins
# that are defined by an age range. This makes the plot much less chaotic and
# much more interpretable.
# Part 7
# Shared encoding for the smoothed trend: age6 on x, democracy_importance on y.
# NOTE(review): loess regresses on the `on` field; age6 is encoded as a
# categorical field elsewhere in this notebook — confirm it is numeric here.
loess_chart_base = alt.Chart(democracy_data).encode(
    x=alt.X("age6:O", sort="-x", title="Age Grouping"),
    y=alt.Y("democracy_importance:Q", title="Average importance of democracy"),
).properties(width=200, height=400, title="")

# Smooth democracy_importance over age6, draw it as a line, one panel per country.
loess_lines = loess_chart_base.transform_loess(
    on='age6',
    loess='democracy_importance',
).mark_line()
loess_lines.facet(facet="country:O")